In [5]:
from tableauscraper import TableauScraper as TS

url = "https://visualizedata.ucop.edu/t/Public/views/TransferbyCCM/Bymajorname"

ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()

for t in workbook.worksheets:
    # Show worksheet name
    print(f"worksheet name : {t.name}") 
worksheet name : Broad Disc Select Label
worksheet name : Campus Broad
worksheet name : Major Table
worksheet name : Yield and Admit
In [2]:
# Find filter values
filters = {}
for t in workbook.worksheets:
    # Show worksheet name
    print(f"name : {t.name}") 
    # Show filters
    print(t.getFilters()) 
    # Save filter settings for later use
    for f in t.getFilters():
        print(f['column'])
        filters[f['column']] = f
name : Broad Disc Select Label
[]
name : Campus Broad
[]
name : Major Table
[{'column': 'Campus', 'ordinal': 0, 'values': ['UCB', 'UCD', 'UCI', 'UCLA', 'UCM', 'UCR', 'UCSB', 'UCSC', 'UCSD'], 'globalFieldName': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:CMP_LOC_LOC1_SHRT_DESC:nk]', 'selection': ['UCB'], 'selectionAlt': [{'fn': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:CMP_LOC_LOC1_SHRT_DESC:nk]', 'columnFullNames': ['[Campus]'], 'domainTables': [{'isSelected': True, 'label': 'UCB'}]}]}, {'column': 'T Acad Yr', 'ordinal': 0, 'values': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021], 'globalFieldName': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:T_ACAD_YR:ok]', 'selection': [2021], 'selectionAlt': [{'fn': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:T_ACAD_YR:ok]', 'columnFullNames': ['[T Acad Yr]'], 'domainTables': [{'label': '2012'}]}]}]
Campus
T Acad Yr
name : Yield and Admit
[]
In [3]:
# Get the values for the Campus filter
values = filters['Campus']['values']
values
Out[3]:
['UCB', 'UCD', 'UCI', 'UCLA', 'UCM', 'UCR', 'UCSB', 'UCSC', 'UCSD']
In [6]:
dataYA={}
dataMT={}
dataCB={}
dataBDSL={}

# For each campus, create dataframes for each worksheet
for c in values:
    ws = ts.getWorksheet("Major Table")
    ucWB = ws.setFilter('Campus',c)
    UC = ucWB.getWorksheet('Yield and Admit')
    dataYA[c] = UC.data
    UC = ucWB.getWorksheet('Broad Disc Select Label')
    dataBDSL[c] = UC.data
    UC = ucWB.getWorksheet('Major Table')
    dataMT[c] = UC.data
    dataMT[c]['University'] = c
    UC = ucWB.getWorksheet('Campus Broad')
    dataCB[c] = UC.data
    print(c)
UCB
UCD
UCI
UCLA
UCM
UCR
UCSB
UCSC
UCSD
In [7]:
from collections import Counter

# Count the number of occurances of each major
dataMT.keys()
counter = Counter()
for i in dataMT.keys():
    counter.update(dataMT[i]['Major name-value'].tolist())
In [8]:
values_rate = []
# Find each university's admit rate
for c in values:
    values_rate.append((c, dataYA[c][dataYA[c]['Measure Names-alias'].isin(['Admit rate'])].iat[0,2]))

# Sort universities by admit rate
values_rate.sort(key=lambda x:x[1])
values_rate = [x[0] for x in values_rate]
values_rate
Out[8]:
['UCLA', 'UCB', 'UCI', 'UCSB', 'UCD', 'UCSD', 'UCSC', 'UCM', 'UCR']
In [9]:
import pandas as pd

def create_majors_df(num_colleges):
    """ 
    Create a cleaned up data set with majors that 
    exist in more than num_colleges universities.
    """
    
    # Find majors that occur at more than requested colleges
    majors = [t[0] for t in counter.items() if t[1]>num_colleges]
    
    # Concatenate data frames from all universities
    df = dataMT[values_rate[0]].copy()

    for u in values_rate[1:]:
        df = pd.concat([df, dataMT[u]], axis=0)

    # Drop duplicated columns
    df = df[df['Major name-value'].isin(majors)]
    cols = [m for m in list(df.columns) if 'alias' in m]
    df = df.drop(columns=cols)

    # Clean up column names
    df = df.rename(
        columns=lambda x: x.replace('SUM(','')
        .replace('AGG(','')
        .replace(')','')
        .replace('-value',''))
    return df
In [10]:
# Create data frame with majors that exist in more than 7 UC's
tot_majors = create_majors_df(7)

# Pivot data frame to one column per major and admit rates per campus as cell values
pivot2 = tot_majors.pivot_table(index='University', columns='Major name', values='Admit rate')
pivot2 = pivot2.reindex(values_rate)
pivot2
Out[10]:
Major name Anthropology Chemistry Computer science Economics History Philosophy Physics Sociology
University
UCLA 0.366359 0.230769 0.023535 NaN NaN 0.406593 0.367021 NaN
UCB 0.145695 0.308271 0.039669 0.221339 0.188623 0.510448 0.398907 0.177609
UCI 0.417808 0.408676 0.111836 0.325048 0.685629 0.713725 0.351724 0.428203
UCSB 0.506329 0.561247 0.096636 0.578329 0.515385 0.462633 0.630252 0.539167
UCD 0.711864 0.770270 0.284636 0.594234 0.728682 0.615385 0.538462 0.686567
UCSD NaN 0.580882 0.158818 0.581800 0.727794 0.715415 0.555556 0.625150
UCSC 0.793846 0.454054 0.316677 0.741525 0.822526 0.812834 0.709302 0.667989
UCM 0.794521 NaN NaN 0.739130 0.785714 NaN NaN 0.731183
UCR 0.804651 0.592965 0.379851 0.810219 0.816406 0.847826 0.631148 0.817017
In [11]:
# Create a second data frame with more majors
tot_majors2 = create_majors_df(3)

# Drop unused columns
cols2 = [m for m in list(tot_majors.columns) if m not in ['Major name', 'Admit rate','Applicants']]
tot_majors2 = tot_majors2.drop(columns=cols2)

# Create new column with the partial sum of the number of applicants per major
tot_majors2['Total Applicants'] = tot_majors2.groupby(['Major name']).Applicants.transform('sum')

# Sort rows by new column
tot_majors2 = tot_majors2.sort_values(by=['Total Applicants'], ascending=False)
tot_majors2
Out[11]:
Major name Applicants Admit rate Total Applicants
5 Computer science 1397 0.096636 13297
21 Computer science 1997 0.023535 13297
32 Computer science 1458 0.284636 13297
19 Computer science 1996 0.158818 13297
14 Computer science 1340 0.379851 13297
... ... ... ... ...
76 Geology 20 0.3 182
71 Geology 30 0.7 182
67 Geology 22 0.318182 182
26 Geology 83 0.46988 182
47 Geology 27 0.444444 182

180 rows × 4 columns

In [12]:
# Create data frame with majors that exist in more than 7 UC's
tot_majors3 = create_majors_df(7)

# Create a new column for the lower GPA admit range limit
tot_majors3['Lower GPA'] = tot_majors['Admit GPA range'].str[0:4].astype(float)

# Pivot data frame to one column per major and lower GPA limit per campus as cell values
pivot = tot_majors3.pivot_table(index='University', columns='Major name', values='Lower GPA')
pivot = pivot.reindex(values_rate)
pivot
Out[12]:
Major name Anthropology Chemistry Computer science Economics History Philosophy Physics Sociology
University
UCLA 3.73 3.84 3.96 NaN NaN 3.77 3.84 NaN
UCB 3.67 3.61 3.81 3.69 3.81 3.66 3.54 3.64
UCI 3.69 3.64 3.92 3.79 3.44 3.41 3.65 3.69
UCSB 3.53 3.59 3.97 3.52 3.55 3.54 3.54 3.52
UCD 3.32 3.26 3.79 3.56 3.39 3.48 3.54 3.48
UCSD NaN 3.49 3.92 3.57 3.29 3.30 3.47 3.47
UCSC 3.05 3.16 3.43 3.15 3.05 3.07 3.29 3.06
UCM 3.02 NaN NaN 3.16 2.99 NaN NaN 3.08
UCR 2.98 3.02 3.51 3.07 3.00 3.05 3.03 3.02
In [13]:
import plotly.graph_objects as go

def drop_down_plot(df, title):
    """
    Create an interactive visualization with 
    a drop down menu to select filter.
    Inspired by: Shinichi Okada
    https://towardsdatascience.com/how-to-create-an-interactive-dropdown-in-jupyter-322277f58a68
    """
    
    fig = go.Figure()
    
    # Add points to plot and trace
    for column in df.columns.to_list():
        fig.add_trace(
            go.Scatter(
                x = df.index,
                y = df[column],
                name = column
            )
        )    
    
    # Create the 'All' button that makes all columns visible 
    # by setting 'visible' to True for all columns
    button_all = dict(label = 'All',
                      method = 'update',
                      args = [{'visible': df.columns.isin(df.columns),
                               'title': 'All',
                               'showlegend':True}])

    def create_layout_button(column):
        """
        Create a button for the provided column which will
        make the data associated with the column visible.
        The name property of the plot lines is used to 
        toggle visibility.
        """
        
        return dict(label = column,
                    method = 'update',
                    args = [{'visible': df.columns.isin([column]),
                             'title': column,
                             'showlegend': True}])

    # Create drop down menu and add the 'All'button 
    # and one button for each column in the data frame
    fig.update_layout(
        updatemenus=[go.layout.Updatemenu(
            active = 0,
            buttons = [button_all] + list(df.columns.map(lambda column: create_layout_button(column)))
            )
        ]     
    )
    # Update remaining layout properties
    fig.update_layout(
        title_text=title,
        height=800
        
    )
   
    fig.show()
In [15]:
import seaborn as sns

sns.set(rc={'figure.figsize':(15,12)})

# Plot major admit rates with majors sorted by 
# total number of applicants from low to high
sns.boxplot(data=tot_majors2, x='Admit rate', y='Major name').set(title='Admit Rates of Common Majors')
print('')

In [16]:
# Interactive plot of admit rate by major with 
# UC's sorted by acceptance rate from low to high
drop_down_plot(pivot2, title="Admit Rates Per Campus of Most Common Majors") 
In [17]:
# Interactive plot of lower GPA admit limit per major 
# with UC's sorted by acceptance rate from low to high
drop_down_plot(pivot, title="Lower GPA Admit Limit of Most Common Majors by Campus") 
In [18]:
from scipy import stats

# Test for significance of major with admit rates
group1 = pivot2.iloc[:, 0].dropna().values.tolist()
group2 = pivot2.iloc[:, 1].dropna().values.tolist()
group3 = pivot2.iloc[:, 2].dropna().values.tolist()
group4 = pivot2.iloc[:, 3].dropna().values.tolist()
group5 = pivot2.iloc[:, 4].dropna().values.tolist()
group6 = pivot2.iloc[:, 5].dropna().values.tolist()
group7 = pivot2.iloc[:, 6].dropna().values.tolist()
group8 = pivot2.iloc[:, 7].dropna().values.tolist()

# Conduct Krushal-Wallis test on 8 most common majors
stats.kruskal(group1,group2,group3,group4,group5,group6,group7,group8)

# Since p < 0.05 reject null hypothesis and conclude significant 
# difference between majors in terms of acceptance rate
Out[18]:
KruskalResult(statistic=21.64038461538462, pvalue=0.0029294091653327367)
In [19]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
In [20]:
tot_majors['Admit rate'] = tot_majors['Admit rate'].astype(float)

# Perform Tukey HSD test to test for pairwise significance
tukey = pairwise_tukeyhsd(endog=tot_majors['Admit rate'],
                          groups=tot_majors['Major name'],
                          alpha=0.05)

# Convert results to a pandas data frame
tukey_df = pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
tukey_df.rename(columns = {'p-adj':'p_adj'}, inplace = True)
tukey_df

# Conclude statistically significant difference between the means of the admit 
# rates for the major pair if p<0.05, indicated also by True in 'reject' column
Out[20]:
group1 group2 meandiff p_adj lower upper reject
0 Anthropology Chemistry -0.0993 0.9582 -0.3881 0.1896 False
1 Anthropology Computer science -0.3912 0.0028 -0.6884 -0.0939 True
2 Anthropology Economics 0.0063 1.0000 -0.2909 0.3035 False
3 Anthropology History 0.0912 0.9775 -0.2060 0.3884 False
4 Anthropology Philosophy 0.0680 0.9961 -0.2293 0.3652 False
5 Anthropology Physics -0.0448 0.9997 -0.3421 0.2524 False
6 Anthropology Sociology 0.0165 1.0000 -0.2808 0.3137 False
7 Chemistry Computer science -0.2919 0.0459 -0.5808 -0.0031 True
8 Chemistry Economics 0.1056 0.9424 -0.1833 0.3944 False
9 Chemistry History 0.1905 0.4433 -0.0984 0.4793 False
10 Chemistry Philosophy 0.1672 0.6086 -0.1216 0.4561 False
11 Chemistry Physics 0.0544 0.9988 -0.2344 0.3433 False
12 Chemistry Sociology 0.1157 0.9093 -0.1731 0.4046 False
13 Computer science Economics 0.3975 0.0022 0.1003 0.6947 True
14 Computer science History 0.4824 0.0001 0.1852 0.7796 True
15 Computer science Philosophy 0.4592 0.0002 0.1619 0.7564 True
16 Computer science Physics 0.3463 0.0119 0.0491 0.6436 True
17 Computer science Sociology 0.4077 0.0016 0.1104 0.7049 True
18 Economics History 0.0849 0.9850 -0.2123 0.3821 False
19 Economics Philosophy 0.0617 0.9979 -0.2356 0.3589 False
20 Economics Physics -0.0512 0.9994 -0.3484 0.2461 False
21 Economics Sociology 0.0102 1.0000 -0.2871 0.3074 False
22 History Philosophy -0.0232 1.0000 -0.3205 0.2740 False
23 History Physics -0.1360 0.8346 -0.4333 0.1612 False
24 History Sociology -0.0747 0.9930 -0.3720 0.2225 False
25 Philosophy Physics -0.1128 0.9304 -0.4100 0.1844 False
26 Philosophy Sociology -0.0515 0.9993 -0.3487 0.2457 False
27 Physics Sociology 0.0613 0.9979 -0.2359 0.3585 False
In [21]:
# Create interactive plot of Tukey HSD test

# DATAFRAME HEADING FOR DOCS
fig = go.Figure(data=[go.Table(
    header=dict(values=list(tukey_df.columns),
                fill_color='cornflowerblue',
                align='left'),
    cells=dict(values=[tukey_df.group1, 
                       tukey_df.group2, 
                       tukey_df.meandiff, 
                       tukey_df.p_adj,
                       tukey_df.lower, 
                       tukey_df.upper, 
                       tukey_df.reject],
               fill_color='ivory',
               align='left'))
])
fig.update_layout(
    title_text = "Tukey Table for Comparisons of Admit Rate Per Major",
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 12))
fig.show()
In [26]:
tot_majors
Out[26]:
Broad discipline Major name College/School Applicants Enrolls Admits Enroll GPA range Yield rate Admit GPA range Admit rate University
21 Computer & Information sciences Computer science School Of Engineering & applied sci 1997 29 47 3.93 - 4.00 0.617021 3.96 - 4.00 0.023535 UCLA
60 Letters Philosophy College Of Letters & science 364 78 148 3.75 - 4.00 0.527027 3.77 - 4.00 0.406593 UCLA
73 Physical Sciences Chemistry College Of Letters & science 286 35 66 3.83 - 3.99 0.530303 3.84 - 4.00 0.230769 UCLA
77 Physical Sciences Physics College Of Letters & science 188 32 69 3.85 - 3.94 0.463768 3.84 - 3.96 0.367021 UCLA
81 Social Sciences Anthropology College Of Letters & science 434 110 159 3.70 - 3.94 0.691824 3.73 - 3.96 0.366359 UCLA
... ... ... ... ... ... ... ... ... ... ... ...
46 Physical Sciences Chemistry College Of Natural & agricultural sci 199 17 118 2.72 - 3.13 0.144068 3.02 - 3.61 0.592965 UCR
48 Physical Sciences Physics College Of Natural & agricultural sci 122 11 77 2.97 - 3.47 0.142857 3.03 - 3.77 0.631148 UCR
52 Social Sciences Anthropology College Of Humanities & social sci 215 38 173 2.79 - 3.47 0.219653 2.98 - 3.63 0.804651 UCR
54 Social Sciences Economics College Of Humanities & social sci 548 72 444 2.85 - 3.38 0.162162 3.07 - 3.70 0.810219 UCR
59 Social Sciences Sociology College Of Humanities & social sci 1093 189 893 2.89 - 3.35 0.211646 3.02 - 3.66 0.817017 UCR

65 rows × 11 columns

In [30]:
# Convert majors into dummy variables
mlr = pd.get_dummies(data=tot_majors, columns=['Major name'])
mlr

# Set up data for multiple linear regression
x = mlr[['Major name_Anthropology', 'Major name_Chemistry', 
         'Major name_Computer science','Major name_Economics', 
         'Major name_History', 'Major name_Philosophy',
         'Major name_Physics','Major name_Sociology']]
y = mlr['Admit rate']
Out[30]:
Broad discipline College/School Applicants Enrolls Admits Enroll GPA range Yield rate Admit GPA range Admit rate University Major name_Anthropology Major name_Chemistry Major name_Computer science Major name_Economics Major name_History Major name_Philosophy Major name_Physics Major name_Sociology
21 Computer & Information sciences School Of Engineering & applied sci 1997 29 47 3.93 - 4.00 0.617021 3.96 - 4.00 0.023535 UCLA 0 0 1 0 0 0 0 0
60 Letters College Of Letters & science 364 78 148 3.75 - 4.00 0.527027 3.77 - 4.00 0.406593 UCLA 0 0 0 0 0 1 0 0
73 Physical Sciences College Of Letters & science 286 35 66 3.83 - 3.99 0.530303 3.84 - 4.00 0.230769 UCLA 0 1 0 0 0 0 0 0
77 Physical Sciences College Of Letters & science 188 32 69 3.85 - 3.94 0.463768 3.84 - 3.96 0.367021 UCLA 0 0 0 0 0 0 1 0
81 Social Sciences College Of Letters & science 434 110 159 3.70 - 3.94 0.691824 3.73 - 3.96 0.366359 UCLA 1 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
46 Physical Sciences College Of Natural & agricultural sci 199 17 118 2.72 - 3.13 0.144068 3.02 - 3.61 0.592965 UCR 0 1 0 0 0 0 0 0
48 Physical Sciences College Of Natural & agricultural sci 122 11 77 2.97 - 3.47 0.142857 3.03 - 3.77 0.631148 UCR 0 0 0 0 0 0 1 0
52 Social Sciences College Of Humanities & social sci 215 38 173 2.79 - 3.47 0.219653 2.98 - 3.63 0.804651 UCR 1 0 0 0 0 0 0 0
54 Social Sciences College Of Humanities & social sci 548 72 444 2.85 - 3.38 0.162162 3.07 - 3.70 0.810219 UCR 0 0 0 1 0 0 0 0
59 Social Sciences College Of Humanities & social sci 1093 189 893 2.89 - 3.35 0.211646 3.02 - 3.66 0.817017 UCR 0 0 0 0 0 0 0 1

65 rows × 18 columns

In [32]:
from sklearn import linear_model

# Fit linear regression model using sklearn
regr = linear_model.LinearRegression()
regr.fit(x, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
Intercept: 
 0.523472810777838
Coefficients: 
 [ 0.04416149 -0.05509447 -0.34701547  0.05048021  0.13537195  0.1121347
 -0.00067636  0.06063795]
In [33]:
import statsmodels.api as sm

# Add a constant
x = sm.add_constant(x) 

# Fit OLS linear regression model using statsmodels
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
 
print_model = model.summary()
print(print_model)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Admit rate   R-squared:                       0.390
Model:                            OLS   Adj. R-squared:                  0.316
Method:                 Least Squares   F-statistic:                     5.215
Date:                Tue, 06 Dec 2022   Prob (F-statistic):           0.000123
Time:                        15:01:10   Log-Likelihood:                 20.346
No. Observations:                  65   AIC:                            -24.69
Df Residuals:                      57   BIC:                            -7.297
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
===============================================================================================
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                           0.4653      0.021     22.319      0.000       0.424       0.507
Major name_Anthropology         0.1023      0.062      1.637      0.107      -0.023       0.227
Major name_Chemistry            0.0031      0.059      0.052      0.959      -0.116       0.122
Major name_Computer science    -0.2889      0.062     -4.622      0.000      -0.414      -0.164
Major name_Economics            0.1086      0.062      1.738      0.088      -0.016       0.234
Major name_History              0.1935      0.062      3.097      0.003       0.068       0.319
Major name_Philosophy           0.1703      0.062      2.725      0.009       0.045       0.295
Major name_Physics              0.0575      0.062      0.920      0.362      -0.068       0.183
Major name_Sociology            0.1188      0.062      1.901      0.062      -0.006       0.244
==============================================================================
Omnibus:                        4.824   Durbin-Watson:                   0.568
Prob(Omnibus):                  0.090   Jarque-Bera (JB):                4.702
Skew:                          -0.653   Prob(JB):                       0.0953
Kurtosis:                       2.825   Cond. No.                     1.35e+16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.02e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]: